## # A tibble: 6 × 20
## X Country.Name Year Agriculture..value.added....…¹ CO2.emissions..metri…²
## <int> <chr> <int> <dbl> <dbl>
## 1 0 Afghanistan 1962 NA 0.0738
## 2 1 Afghanistan 1967 NA 0.124
## 3 2 Afghanistan 1972 NA 0.131
## 4 3 Afghanistan 1977 NA 0.183
## 5 4 Afghanistan 1982 NA 0.166
## 6 5 Afghanistan 1987 NA 0.276
## # ℹ abbreviated names: ¹Agriculture..value.added....of.GDP.,
## # ²CO2.emissions..metric.tons.per.capita.
## # ℹ 15 more variables:
## # Domestic.credit.provided.by.financial.sector....of.GDP. <dbl>,
## # Electric.power.consumption..kWh.per.capita. <dbl>,
## # Energy.use..kg.of.oil.equivalent.per.capita. <dbl>,
## # Exports.of.goods.and.services....of.GDP. <dbl>, …
# Keep only the observations recorded in 1962.
filtered_data1 <- data %>%
  filter(Year == 1962)

# Scatter plot of CO2 emissions per capita against GDP per capita for 1962.
filtered_data1 %>%
  ggplot(aes(x = CO2.emissions..metric.tons.per.capita., y = gdpPercap)) +
  geom_point()

# Pearson correlation test between the two variables; printing the result
# shows the estimate, its 95% confidence interval and the p-value.
cor_res <- cor.test(
  filtered_data1$CO2.emissions..metric.tons.per.capita.,
  filtered_data1$gdpPercap
)
cor_res
##
## Pearson's product-moment correlation
##
## data: filtered_data1$CO2.emissions..metric.tons.per.capita. and filtered_data1$gdpPercap
## t = 25.269, df = 106, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8934697 0.9489792
## sample estimates:
## cor
## 0.9260817
## [1] 1.128679e-46
The correlation coefficient of approximately 0.9261 suggests a very strong positive linear relationship between “CO2 emissions (metric tons per capita)” and “GDP per capita” in 1962. The 95% confidence interval (0.8934697 to 0.9489792) lies well above zero and the p-value is far below 0.05, so the correlation is statistically significant.
# For each year, correlate CO2 emissions per capita with GDP per capita using
# only rows where both values are present, then keep the year with the
# largest correlation and render it as a table.
data %>%
  filter(complete.cases(CO2.emissions..metric.tons.per.capita., gdpPercap)) %>%
  group_by(Year) %>%
  summarise(cor = cor(CO2.emissions..metric.tons.per.capita., gdpPercap)) %>%
  slice(which.max(cor)) %>%
  kbl() %>%
  kable_material()

| Year | cor |
|---|---|
| 1967 | 0.9387918 |
The maximum correlation between CO2 emissions per capita and GDP per capita was observed in 1967. Hence, we subset the data to include only the rows corresponding to the year 1967.
# Keep only the observations recorded in 1967.
filtered_data2 <- data %>%
  filter(Year == 1967)

# Show the first five rows as a styled table inside a horizontally
# scrollable box.
kbl(head(filtered_data2, 5)) %>%
  kable_material(c("striped", "hover")) %>%
  scroll_box(width = "830px")

| X | Country.Name | Year | Agriculture..value.added….of.GDP. | CO2.emissions..metric.tons.per.capita. | Domestic.credit.provided.by.financial.sector….of.GDP. | Electric.power.consumption..kWh.per.capita. | Energy.use..kg.of.oil.equivalent.per.capita. | Exports.of.goods.and.services….of.GDP. | Fertility.rate..total..births.per.woman. | GDP.growth..annual… | Imports.of.goods.and.services….of.GDP. | Industry..value.added….of.GDP. | Inflation..GDP.deflator..annual… | Life.expectancy.at.birth..total..years. | Population.density..people.per.sq..km.of.land.area. | Services..etc…value.added….of.GDP. | pop | continent | gdpPercap |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Afghanistan | 1967 | NA | 0.1237824 | 9.917662 | NA | NA | 6.772908 | 7.450 | NA | 14.20983 | NA | NA | 35.38941 | 15.881812 | NA | 11537966 | Asia | 836.1971 |
| 11 | Albania | 1967 | NA | 1.3637463 | NA | NA | NA | NA | 5.394 | NA | NA | NA | NA | 66.28722 | 71.737153 | NA | 1984060 | Europe | 2760.1969 |
| 21 | Algeria | 1967 | 10.33067 | 0.6321184 | 27.977088 | NA | NA | 23.434417 | 7.672 | 9.452963 | 21.63177 | 42.38589 | 1.312041 | 49.18751 | 5.606908 | 47.28345 | 12760499 | Africa | 3246.9918 |
| 31 | American Samoa | 1967 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 125.580000 | NA | NA | NA | |
| 41 | Andorra | 1967 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 44.159574 | NA | NA | NA |
# Boxplot of energy use per capita for each continent, to visualise how the
# distribution of energy use differs between continents.
data %>%
  ggplot(aes(x = continent, y = Energy.use..kg.of.oil.equivalent.per.capita.)) +
  geom_boxplot()

From the plot above, there seem to be some differences in energy use across the continents, particularly Asia, Europe and Oceania (the highest median is observed for Oceania). We will test the significance of these differences statistically using an ANOVA test.
# One-way ANOVA of energy use per capita on continent. The formula refers to
# the columns through `data$` so the summary labels the term `data$continent`.
# NOTE(review): the summary reports 5 Df for continent, i.e. six groups were
# compared — confirm none of them is a blank or missing continent label.
aov_model <- aov(
  formula = data$Energy.use..kg.of.oil.equivalent.per.capita. ~ data$continent
)
summary(aov_model)
## Df Sum Sq Mean Sq F value Pr(>F)
## data$continent 5 8.124e+08 162482656 21.88 <2e-16 ***
## Residuals 1404 1.043e+10 7426183
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1197 observations deleted due to missingness
Here, the observed p-value is very small (< 2e-16) and provides strong evidence to reject the null hypothesis of equal mean energy use. This indicates statistically significant differences in energy use across the continents.
# Overlaid density curves of imports (% of GDP) for Europe and Asia, using
# observations after 1990 only.
data %>%
  filter(Year > 1990, continent %in% c("Europe", "Asia")) %>%
  ggplot(
    mapping = aes(
      x = Imports.of.goods.and.services....of.GDP.,
      fill = continent
    )
  ) +
  geom_density(alpha = 0.3) +
  labs(title = "Imports of goods and services between Europe and Asia")
# stats
# Europe/Asia observations after 1990, reduced to the two columns the test
# needs.
my_Data <- data %>%
  filter(Year > 1990, continent %in% c("Europe", "Asia")) %>%
  select(continent, Imports.of.goods.and.services....of.GDP.)

# Welch two-sample t-test comparing mean imports (% of GDP) between the two
# continents.
t.test(Imports.of.goods.and.services....of.GDP. ~ continent, data = my_Data)
##
## Welch Two Sample t-test
##
## data: Imports.of.goods.and.services....of.GDP. by continent
## t = 1.3552, df = 137.53, p-value = 0.1776
## alternative hypothesis: true difference in means between group Asia and group Europe is not equal to 0
## 95 percent confidence interval:
## -2.321099 12.433240
## sample estimates:
## mean in group Asia mean in group Europe
## 46.84531 41.78924
Based on the results, the p-value of 0.1776 is greater than the typical significance level of 0.05. This means we cannot reject the null hypothesis: the data do not provide sufficient evidence of a difference in mean imports of goods and services (% of GDP) between Asia and Europe after 1990.
# Mean population density per country over all years, keeping the country
# with the largest mean and rendering it as a table.
# NOTE(review): mean() is called without na.rm = TRUE, so a country with any
# missing density value gets an NA mean and which.max() skips it — confirm
# that excluding such countries is intended.
data %>%
  group_by(Country.Name) %>%
  summarise(mean = mean(Population.density..people.per.sq..km.of.land.area.)) %>%
  slice(which.max(mean)) %>%
  kbl() %>%
  kable_material("striped")

| Country.Name | mean |
|---|---|
| Macao SAR, China | 14732.04 |
Macao SAR, China has the highest mean ‘Population density (people per sq. km of land area)’ across all years.
# Change in life expectancy at birth between 1962 and 2007 for each country:
# one row per country with the two years as columns, sorted so the largest
# increase comes first. No group_by() is needed because pivot_wider() already
# yields one row per country, and omitting it keeps the result ungrouped.
glexpData <- data %>%
  filter(Year %in% c(1962, 2007)) %>%
  select(Year, Country.Name, Life.expectancy.at.birth..total..years.) %>%
  pivot_wider(
    names_from = Year,
    values_from = Life.expectancy.at.birth..total..years.
  ) %>%
  mutate(diff_LE = `2007` - `1962`) %>%
  arrange(desc(diff_LE))

# Show the five countries with the largest increase.
kbl(head(glexpData, 5)) %>%
  kable_material(c("striped", "hover"))

| Country.Name | 1962 | 2007 | diff_LE |
|---|---|---|---|
| Maldives | 38.48356 | 75.39971 | 36.91615 |
| Bhutan | 33.09415 | 66.29310 | 33.19895 |
| Timor-Leste | 34.73905 | 65.82420 | 31.08515 |
| Tunisia | 43.34168 | 74.20244 | 30.86076 |
| Oman | 44.30051 | 75.12361 | 30.82310 |
The Maldives saw the greatest increase in life expectancy at birth between 1962 and 2007 (about 36.9 years).